% !TEX program = xelatex
%%%%%%%%%%%%%%%%%%%%%%%%

\input{preamble.tex}
\otherslide

\subtitle{Introduction}

\begin{document}
\maketitle

\introslide

\begin{frame}{\outline}
    \vspace{10pt}
    \fat{Part 1: Introduction} 
    \vspace{5pt}
    \blist
        \item Multi-agent systems
        \item Multi-agent RL in multi-agent systems
        \item Challenges of MARL   
    \elist
    \vspace{10pt}
    \fat{Part 2: Reinforcement Learning Basics} 
    \vspace{5pt}
    \blist
        \item Markov decision processes
        \item Discounted returns
        \item Dynamic programming and temporal-difference learning
    \elist
\end{frame}

\section{Part 1: Introduction}

\begin{frame}{What is MARL?}
		\textbf{\textit{Multi-agent reinforcement learning (MARL) is about finding optimal decision policies for two or more artificial agents interacting in a shared environment.}}
  \vspace{20pt}
  \blist
    \item Applying reinforcement learning (RL) algorithms to multi-agent systems
    \item Goal is to learn optimal policies for two or more agents
  \elist
\end{frame}

\begin{frame}[t]{MARL Applications}
    \centering
    % Top left image
    \begin{minipage}{.5\linewidth}
        \centering
        \includegraphics[width=.65\linewidth, keepaspectratio]{images/1_starcraft.png}
        
        Computer games
    \end{minipage}%
    % Top right image
    \begin{minipage}{.5\linewidth}
        \centering
        \includegraphics[width=.65\linewidth, keepaspectratio]{images/1_self_driving.png}
        
        Autonomous driving
    \end{minipage}
    
    % Bottom left image
    \begin{minipage}{.5\linewidth}
        \centering
        \includegraphics[width=.65\linewidth, keepaspectratio]{images/1_reware.png}
        
        Multi-robot warehouses
    \end{minipage}%
    % Bottom right image
    \begin{minipage}{.5\linewidth}
        \centering
        \includegraphics[width=.65\linewidth, keepaspectratio]{images/1_trading.png}
        
        Automated trading
    \end{minipage}

\end{frame}

\begin{frame}{Mutli-Agent Systems}
	\begin{graytitlebox}{A multi-agent system consists of:}
    \blist
        \item<1-> \textbf{Environment:} The environment is a physical or virtual world whose state evolves and is influenced by the agents' actions within the environment.
        
        \item<2-> \textbf{Agents:} An agent is an entity which receives information about the state of the environment and can choose actions to influence the state.
        \listtab Agents are goal-directed, e.g. maximizing returns
    \elist
    \end{graytitlebox}
\end{frame}

% Need to edit potentially using columns. 
\begin{frame}{Multi-Agent Systems}		
    \begin{figure}
        \centering
        \includegraphics[width=\textwidth,height=.9\textheight,keepaspectratio]{images/chapter_1/mas_schematic.pdf}
        \label{fig:enter-label}
    \end{figure}
\end{frame}

\begin{frame}{Example: Level-Based Foraging}
	\bcol
		\col{0.5}
		    \blist
		        \item Three agents (robots) with varying skill levels
		        \item Goal: to collect all items (apples)
		        \item Items can be collected if a group of one or more agents are located next to the item and the sum of agents' levels is greater than or equal to the item level
		        \item Action space $\Ac = \{up, down, left, right, collect, noop\}$
		    \elist
    	\col{0.45}
    		\centering
 	        \includegraphics[width=\textwidth]{images/environments/lbf/foraging_8x12_b.png}
 	 \ecol
\end{frame}

\begin{frame}{MARL for Solving Multi-Agent Systems}

    \begin{figure}
        \centering
        \includegraphics[width=.6\textwidth]{images/chapter_1/MARL-loop.pdf}
    \end{figure} 

    \blist
        \item {\bf Goal:} learn optimal policies for a set of agents in a multi-agent system
        \item Each agent chooses an action based on its policy $\Rightarrow$ joint action
        \item Joint action affects environment state; agents get rewards $+$ new observations
     \elist
\end{frame}

\begin{frame}{Why MARL?}
	\small

	Why should we use MARL to find optimal solutions to multi-agent systems rather than controlling multiple 'agents' using a single-agent RL (SARL) algorithm?% I.e. one agent controlling the actions of all agents. 

	\vspace{1.5em}

  \bcol
    \tcol{0.45}
    	\visible<2->{
        {\bf Decomposing a large problem}
        \blist
            \item In LBF example, controlling $3$ robots each with $6$ actions, the joint action space becomes $6^3 = 216$.
            \listtab Large action space for SARL!
            
            \item We can decompose this into three independent agents, each selecting from only 6 actions.
            \listtab Use MARL to train separate agent policies (more tractable)
        \elist}

    \tcol{0.45}
	    \visible<3->{
        {\bf Decentralized decision making}
        \blist
            \item There are many real-world scenarios where it is required for each agent to make decisions independently.
            \item E.g. autonomous driving is impractical for frequent long-distance data exchanges between a central agent and the vehicle.
        \elist}
	\ecol
\end{frame}

\begin{frame}{Challenges of MARL}
	\begin{graytitlebox}{New \fat{challenges} arise in MARL:}
    \blist
        \item<1-> Non-stationarity caused by multiple learning agents
        \item<2-> Optimality of policies and equilibrium selection
        \item<3-> Multi-agent credit assignment
        \item<4-> Scaling in number of agents
    \elist
    \end{graytitlebox}
    
    \visible<5->{We will explore these challenges in upcoming lectures!}
    
\end{frame}

	\begin{frame}{Challenges of Multi-Agent Learning}
		\bcol
			\col{0.50}
				\fat{Non-stationary environment:}
				
				\vspace{5pt}
				
				If multiple agents are learning, the environment becomes \e{non-stationary} from the perspective of individual agents
				
				\vspace{15pt}
				
				$\Rightarrow$ {\bf Moving target:} each agent is optimizing against changing policies of other agents
				
			\col{0.45}
				\vspace{10pt}
				\imgw{images/chapter_5/wolfphc_rps_own.pdf}{1.0}
		\ecol
	\end{frame}

\begin{frame}{Multi-Agent Credit Assignment}

{\bf Multi-agent credit assignment:} which agent's actions contributed to received rewards? \\[15pt]

\bcol
    \col{0.5}
            \centering
            \includegraphics[width=0.85\textwidth]{images/environments/lbf/foraging_8x12_b.png}

    \col{0.5}
        \blist
            \item At time step $t$ all agents perform \textit{collect}, each receiving reward $+1$
            \item Whose actions led to the reward?
            \item The agent on the left did not contribute
            \item Learning agents must disentangle the contributions of actions!
        \elist

\ecol
    
\end{frame}


\section{Part 2: Reinforcement Learning Basics}

	\begin{frame}{What is Reinforcement Learning?}
		\begin{graytitlebox}{Reinforcement learning (RL)}
			Learning to solve sequential decision problems via \e{repeated interaction with environment} %(trial and error)
		\end{graytitlebox}
		\vspace{10pt}
		\blist
			\item What is a sequential decision problem?
			\item What does it mean to ``solve'' the problem?
			\item What is learning by interaction?
		\elist
	\end{frame}

	\begin{frame}[t]{What is Reinforcement Learning?}
		\e{Agent} takes \e{actions} in \e{environment}
		\blist
			\item Take action, observe new \e{state} and \e{reward} from environment
			\item Goal is to maximise total rewards received
			\listtab Learning: find best actions by {\it trying} them
		\elist
		\begin{center}
			 \includegraphics[width=0.6\textwidth]{images/chapter_2/RL-loop.pdf}
		\end{center}
	\end{frame}
	
\begin{frame}{RL Problem Definition}

    \begin{center}
        \includegraphics[width=0.9\textwidth]{images/chapter_2/rl-learning-problem.pdf}
    \end{center}

    \blist
        \item Sequential decision process is defined formally as \e{Markov decision process} (MDP)
        \item Goal is to learn an optimal decision policy for a specific learning objective
    \elist
    
\end{frame}

\begin{frame}{Markov Decision Process -- Graph}
    
      \centering
      \includegraphics[width=1\textwidth]{images/1_mdp_diagram_elipses.pdf}
    
\end{frame}

\begin{frame}{Markov Decision Process --  Definition}
	\vspace{10pt}
	
  \bcol

    \col{0.5}
     MDP is defined as: \\[5pt]
     
      \blist
      	\itemsep=10pt
        \item \textcolor{blue}{$S$}: Finite set of states, with subset of terminal states $\bar{S} \subset S$
        \item \textcolor{red}{$A$}: Finite set of actions
        \item \textcolor{green}{$\mathcal{R}$}: Reward function $\mathcal{R}: \textcolor{blue}{S} \times \textcolor{red}{A} \times \textcolor{blue}{S} \to \mathbb{R}$
        \item $\mathcal{T}:$ State transition function $\mathcal{T}: \textcolor{blue}{S} \times \textcolor{red}{A} \times \textcolor{blue}{S} \to [0, 1]$
        \item $\mu$: Initial state distribution $\mu: S \to [0,1]$
        %such that $\sum_{s\in S}\mu (s) = 1$ and $\forall s \in \hat{S}: \mu (s) = 0$
      \elist

    \col{0.5}
        \includegraphics[width=\linewidth]{images/1_mdp_diagram.pdf}

  \ecol
\end{frame}

\begin{frame}{MDP Assumptions}

    \begin{columns}[T]
        \begin{column}{0.5\textwidth}
        	\visible<1->{
            \fat{\e{Markov property}}
            \blist
                \item Future states are temporally independent of past states and actions, given the current state and action. 
                \item $Pr(s^{t+1}, r^{t}|s^{t}, a^{t}, s^{t-1}, a^{t-1}, ...,s^0, a^0) = Pr(s^{t+1}, r^{t}|s^{t}, a^{t})$
                % \item Meaning that the current state provides all necessary information for an agent to choose the optimal action. 
            \elist }
            \vspace{10pt}
            \visible<2->{
            \fat{\e{Full observability}}
            \blist
                \item Agent can see the entire state of the environment. 
                \item In many applications, agent may only have partial view.
            \elist} 
        \end{column}
        
        \hspace{10pt}
        
        \begin{column}{0.5\textwidth}
            \visible<3->{
            \fat{\e{Stationarity}}
            \blist
                \item Dynamics of the environment are assumed to be stationary. 
                \item i.e. $\mathcal{T}$ and \textcolor{green}{$\mathcal{R}$} are constant through time
            \elist}
            \vspace{10pt}
            \visible<4->{
            \fat{\e{Incomplete knowledge of MDP}}
            \blist
                \item Agent may only have knowledge of the action and state spaces ($\textcolor{red}{A}, \textcolor{blue}{S}$)
                \item Transition and reward functions ($\mathcal{T}, \mathcal{R}$) are usually assumed to be \fat{unknown}.
            \elist}
        \end{column}
    \end{columns}
\end{frame}

\begin{frame}[t]{Mars Rover MDP Example}
	\vspace{10pt}
	
    \bcol
    
      \col{0.58}
          \includegraphics[width=1\textwidth]{images/chapter_2/mdp-rover.pdf}
    
      \col{0.40}
      \small
        \blist
          \item \textit{Start} is the initial state \(s^0\)
          \item Two possible actions \(A = \{right, left\}\)
          \item Goal is to get to \textit{Base} terminal state
          \item Rewards given by $\mathcal{R}(s, a, s')$ are shown as $r$
          \item State transition probabilities given by $\mathcal{T}(s, a, s)$, are shown as $p$
        \elist
    
    \ecol
\end{frame}

\begin{frame}{Expected Discounted Returns}

    Using RL, learn policy $\pi$ that maximizes the expected \textbf{return}:

    \blist
        \item Returns are the sum of rewards received over time
        \item If MDP is non-terminating (i.e. $t \to \infty$), we \textbf{discount} returns to ensure finite (discounted) returns
    \elist
    \vspace{0pt}
    $$
    \mathbb{E}_{\pi}[u_{t}] = \mathbb{E}_{\pi}\left[\sum_{t=0}^{\infty}\gamma^t r^t\right]
    $$

    \blist
        \item $\gamma$ is the discount factor, such that $\gamma \in [0, 1]$
        \item $\pi$ is the policy that determines which actions are chosen
    \elist
\end{frame}

\begin{frame}{State-Value Functions}
    \textbf{State-value} functions $V^{\pi}(s)$ give the 'value' of state $s$ when following the policy $\pi$:
    \begin{align*}
    V^{\pi}(s) = \mathbb{E}_\pi \left[u^{t} | s^{t} = s \right]
    \end{align*}
    \pause
    The return ($u$) can be recursively defined as:
    \begin{align*}
            u^t &= r^t +\gamma (r^{t+1} + \gamma r^{t+2} + ... )  \\ &= r^t + \gamma u^{t+1}
    \end{align*}
	\pause
    Therefore:
    \begin{align*}
        V^{\pi}(s) = \mathbb{E}_\pi \left[r^t + \gamma u^{t+1}| s^{t} = s \right]
    \end{align*}
\end{frame}

\begin{frame}[t]{The Bellman Equation}

    \begin{align*}
        V^{\pi}(\st) &= \ex{\pi}{\rew^t + \gamma u^{t+1} \mid \st^{t} = \st } \\[3pt]
                   &= \sum_{\ac \in \Ac} \pi(\ac \mid \st) \sum_{\st' \in \St} \Stf(\st'\mid \st, \ac) \left[\Rew(\st, \ac, \st') + \gamma \ex{\pi}{u^{t+1} \mid \st^{t+1} = \st'}\right] \\[3pt]
                   &= \sum_{\ac \in \Ac} \pi(\ac \mid \st) \sum_{\st' \in \St} \Stf(\st'\mid \st, \ac) \left[\Rew(\st, \ac, \st') + \gamma V^{\pi}(\st')\right]
    \end{align*}
    
    The last equation is known as the \e{Bellman equation} in honor of Richard Bellman. 
    
    \blist
        \item The value of being in state $s$ while following a fixed policy $\pi$ is equivalent to the immediate reward ($\Rew(\st, \ac, \st') \to \rew$) received when taking action $\ac$ in state $\st$ ($\pi(\ac \mid \st)$) plus the discounted value of the next state $\st'$. 
    \elist
\end{frame}

\begin{frame}{State-Action Value Function}
    \textbf{State-action} value functions $Q^{\pi}(s,a)$ are an extension on the \textit{State} value functions. They condition the expected return on the current state and the action taken:
    \begin{align*}
         Q^{\pi}(s,a) = \mathbb{E}_{\pi}\left[u^{t}|s^{t}=s, a^{t}=a\right]
    \end{align*}

The \textit{state-action} value Bellman equation is therefore:
\vspace{10pt}
\begin{align*}
    Q^{\pi}(s,a) = \sum_{s' \in S} \mathcal{T}(s'|s,a) \left[\mathcal{R}(s,a,s') + \gamma V^{\pi}(s')\right]
\end{align*}

\end{frame}

\begin{frame}{Greedy Policies}

    A policy can act \textbf{greedily}: choosing actions that have maximum estimated value.
    \vspace{10pt}
    
    Greedy $\pi$ using \textit{state value} functions:
    \vspace{5pt}    
    \begin{align*}
       \pi(s) = \argmax_{a \in A} \sum_{s', r} \mathcal{T}(s', r | s, a) \left[ r + \gamma V(s') \right]
    \end{align*}
    
    Or using the \textit{state-action value function}:
    \vspace{5pt}
    \begin{align*}
        \pi(s) = \argmax_{a \in A}Q(s, a)    
    \end{align*}
    
\end{frame}

\begin{frame}[t]{Optimal Greedy Policy}

   A greedy policy with respect to a value function is optimal only when using the \textbf{optimal value function.} 
    
    Optimal value function for a MDP is defined as:
    \vspace{5pt}
    \begin{align*}
        V^{*}(s) &= \max_{\pi'} V^{\pi'}(s), \quad \forall s \in S \\
        Q^{*}(s, a) &= \max_{\pi'} Q^{\pi'}(s, a), \quad \forall s \in S, a \in A
    \end{align*}

    Therefore, the optimal policy is:
    \vspace{5pt}
    \begin{align*}
        \pi^{*}(s) = \argmax_{a \in A}Q^{*}(s, a)    
    \end{align*}
\end{frame}

\begin{frame}{Dynamic Programming}
    \blist
    	\itemsep=10pt
        \item Dynamic Programming (DP) is a family of algorithms to compute \textbf{optimal value functions} and \textbf{optimal policies in MDPs} (Bellman 1957; Howard 1960).
        \item In DP, we \textbf{assume complete knowledge} of the MDP, including the transition and reward function ($\mathcal{T}, \mathcal{R}$).
        \item Given complete knowledge, we can use the \textbf{Bellman equation} to find optimal value functions and policies.
    \elist   
\end{frame}

\begin{frame}{Policy Iteration}
	\begin{bluetitlebox}{Policy Iteration}
    \fat{Policy Iteration} is a DP algorithm that alternates between two steps: \\
        \blist
            \item \textbf{Policy evaluation}: compute value function $V^{\pi}$ for current policy $\pi$
            \item \textbf{Policy improvement}: improve current policy $\pi$ with respect to $V^{\pi}$
        \elist 
	    \begin{align*}
	        \pi^{0} \to V^{\pi^{0}}\to \pi^{1} \to V^{\pi^{1} } \to\pi^{2} \to ... \to V^{*} \to \pi^{*} 
	    \end{align*}
    \end{bluetitlebox}

    % \begin{figure}
    %     \centering
    %     \includegraphics[width=\textwidth,height=.45\textheight,keepaspectratio]{images/1_policy_iteration.png}
    %     \label{fig:policy_iteration}
    % \end{figure}
\end{frame}

\begin{frame}[fragile]{Policy Iteration Pseudo Code}

\scalebox{0.8}{\begin{minipage}{\linewidth}
    \balg{Policy Iteration}{pi}
        \State Initialize deterministic policy $\pi$ randomly, and $V(s)=0$ for all $s \in S$
        \Repeat
            \State // {\it Iterative Policy Evaluation:}
            \Repeat
                \For{each state $s$ in $S$}
                    \State $V(s) \gets \sum_{s'} \mathcal{T}(s' | s, \pi(s)) [\mathcal{R}(s, \pi(s), s') + \gamma V(s')]$
                \EndFor
            \Until{$V(s)$ converges for all $s \in S$}
            
            \State // {\it Policy Improvement:}
            \State policy\_stable $\gets$ true
            \For{each state $s$ in $S$}
                \State old\_action $\gets \pi(s)$
                \State \Cboxed{red}{$\pi(s) \gets \arg\max_a \sum_{s'} \mathcal{T}(s' | s, a) [\mathcal{R}(s, a, s') + \gamma V(s')]$}
                \If{old\_action $\neq \pi(s)$}
                    \State policy\_stable $\gets$ false
                \EndIf
            \EndFor
        \Until{policy\_stable}
        \Return $V, \pi$
    \ealg
\end{minipage}%
    }
\end{frame}


\begin{frame}{Value Iteration}
	\textbf{Value Iteration} uses the \textbf{Bellman optimality equation}
    \blist
        \item Combines iterative policy evaluation and improvement into one update rule.
    \elist

    \textit{Bellman Optimality Equation as update operator:}
    \begin{align*}
        V^{k+1}(s) \gets \max_{a \in A} \sum_{s' \in S} \mathcal{T}(s' | s, a) [\mathcal{R}(s, a, s') + \gamma V^k(s')], \quad \forall s \in S
    \end{align*}

    \blist
        \item The $\max$-operator makes this the Bellman \textit{optimality} equation.
        \item The equation expresses the value of a state as the maximum expected return achievable by taking the best action and then following the optimal policy.
    \elist
\end{frame}

\begin{frame}[fragile]{Value Iteration Pseudo Code}

    \balg{Value Iteration}{vi}
        \State Initialize: $V(s) = 0, \forall s \in S$
        \Repeat
            \State \Cboxed{red}{$\forall s \in S: V(s) \gets \max_{a \in A} \sum_{s' \in S} \mathcal{T}(s' | s, a) \left[\mathcal{R}(s, a, s') + \gamma V(s')\right]$}
        \Until{$V$ converged}
        \Return{ optimal policy $\pi^{*}$ with:}
        \State $\forall s \in S: \argmax_{a \in A} \sum_{s' \in S}\mathcal{T}(s'|s, a)\left[\mathcal{R}(s, a, s')+\gamma V(s') \right]$
    \ealg

\end{frame}

\begin{frame}{Temporal-Difference Learning}
    \textbf{Temporal-Difference} (TD) learning is a family of RL algorithms that learn optimal policies and value functions based on data collected via environment \textbf{interactions}. 

    \blist
        \item These algorithms learn which actions yield the best returns by 
        \textbf{trial and error} and exploring different actions and states
    	\listtab Learns without \textbf{model} of environment's \textbf{reward} and \textbf{transition} functions ($\mathcal{R}, \mathcal{T}$)
        \listtab Can learn \textbf{online}, updating the policy while interacting with the environment
    \elist
\end{frame}

\begin{frame}{Temporal-Difference Update}
    The update for temporal-difference learning uses value functions:
    \vspace{0pt}
    \begin{align*}
        V(s^{t}) \gets V(s^{t}) + \alpha \left[\mathcal{X} - V(s^{t})\right]
    \end{align*}
    or 
    \vspace{0pt}
    \begin{align*}
        Q(s^{t}, a^{t}) \gets Q(s^{t}, a^{t})  + \alpha \left[\mathcal{X} - Q(s^{t}, a^{t})\right]
    \end{align*}
	\blist
    	\item $\mathcal{X}$ is the update target, serving as an estimate of the current state value. 
    	\item $\alpha$ is the learning rate
    \elist
\end{frame}

\begin{frame}{Temporal-Difference Update}
    
    In SARSA (a basic TD algorithm), we use the experience tuple $(\st^{t}, \ac^{t}, \rew^{t}, \st^{t+1}, \ac^{t+1})$ to construct a target:
    \begin{align*}
        \mathcal{X} = \rew^t + \gamma Q(\st^{t+1}, \ac^{t+1})
    \end{align*}
    
    (The immediate reward plus the discounted value of the next state)
    
    Note the resemblance to the Bellman equation:
    \begin{align*}
        Q^{\pi}(\st, \ac) = \sum_{\st' \in \St} \Stf(\st' \mid \st, \ac) \Cboxed{red}{$\left[ \Rew(\st, \ac, \st') + \gamma \sum\limits_{\ac' \in \Ac} \pi(\ac' \mid \st') Q^{\pi}(\st', \ac') \right]$}
    \end{align*}
\end{frame}

\begin{frame}{SARSA}
    
The SARSA update rule thus becomes:
$$
  Q(s^t, a^t) \leftarrow Q(s^t, a^t) + \alpha[r^t + \gamma Q(s^{t+1}, a^{t+1})- Q(s^t,a^t)] 
$$

\vspace{5pt}

\blist
	\itemsep=10pt
	\item The TD update is \textbf{bootstrapped}: using the value estimates of the next state ($Q(s^{t+1}, a^{t+1})$) to update the current state value ($Q(s^t,a^t)$)
    \item $\left( r^t + \gamma Q(s^{t+1}, a^{t+1})- Q(s^t,a^t) \right)$ is also called ``TD error''
\elist

\end{frame}

\begin{frame}[fragile]{SARSA Pseudo Code}
    \centering 
    \balg{SARSA}{sarsa}
        \State Initialize \( Q(s, a) = 0 \) for all \( s \in S, a \in A \)
        \For{every episode}
            \State Observe initial state \( s^0 \)
            \State With probability \( \epsilon \): choose random action \( a^0 \in A \)
            \State Otherwise: choose action \( a^0 \in \arg\max_a Q(s^0, a) \)
            \For{\( t = 0, 1, 2, \ldots \)}
                \State Apply action \( a^t \), observe reward \( r' \) and next state \( s^{t+1} \)
                \State With probability \( \epsilon \): choose random action \( a^{t+1} \in A \)
                \State Otherwise: choose action \( a^{t+1} \in \arg\max_a Q(s^{t+1}, a) \)
                \State \Cboxed{red}{\( Q(s^t, a^t) \gets Q(s^t, a^t) + \alpha [ r^{t} + \gamma Q(s^{t+1}, a^{t+1}) - Q(s^t, a^t) ] \)}
            \EndFor
        \EndFor
    \ealg
\end{frame}

\begin{frame}[t]{Convergence of SARSA}

    SARSA is guaranteed to converge to the optimal \textbf{state-value function}, for all $S \in S$ and $a \in A$, if:
    \blist
        \item<1-> All \textit{state-action} pairs are explored infinitely many times: $$\forall s \in S, a \in A: \sum_{k=0}^{\infty} \mathbb{I}(s, a) \to \infty $$

        \item<2-> The learning rate $\alpha$ is reduced over time according to the "standard stochastic approximation conditions": $$\forall s \in S, a \in A: \sum_{k=1}^{\infty} \alpha_k(s, a) \to \infty \quad \text{and} \quad \sum_{k=1}^{\infty} \alpha_k(s, a)^2 < \infty$$
    \elist        
\end{frame}

\begin{frame}{$\epsilon$-Greedy Policies}

    Using a \textbf{greedy policy} would \textbf{violate the convergence condition} of SARSA (infinite exploration of $S$ and $A$). 
    \vspace{0pt}
    \blist
        \item Intuitively, we must explore a wide range of states and actions to find state action combinations that yield high returns
        
        \item One solution is to add an \textbf{exploration} parameter $\epsilon \in [0, 1]$. This gives us a stochastic \textbf{epsilon-greedy} policy
    $$
        \pi(a|s) = \begin{cases} 
        1 - \epsilon + \frac{\epsilon}{|A|} & \text{if } a \in \arg\max_{a'} Q(s, a') \\
        \frac{\epsilon}{|A|} & \text{otherwise} 
    \end{cases}
    $$
    
        \item With probability $1-\epsilon$, the policy chooses the greedy action, and with probability $\epsilon$ chooses an action uniformly at random
    \elist
    
\end{frame}

\begin{frame}{Q-Learning}

    \e{Q-learning} (Watkins \& Dayan 1992) is a popular TD algorithm which uses the Bellman optimality equation to update its value function estimates.
    \vspace{10pt}
    \blist
        \item By using the Bellman optimality equation, Q-learning directly learns the \textbf{optimal state-action value function}
        \item Q-learning is \textbf{off-policy}, meaning the policy followed to gather experiences is different from the optimized policy
        \item We use the $\epsilon$-greedy policy to collect experiences
    \elist

\end{frame}

\begin{frame}{Q-Learning Update}

    The target in Q-learning uses the \textit{max} operator to target the optimal Q-values directly:
    \begin{align*}
        \mathcal{X} = r^{t} + \gamma \max_{a' \in A}Q(s^{t+1}, a')
    \end{align*}

    The Q-learning update is thus:
    \begin{align*}
        Q(s^t, a^t) \gets Q(s^t, a^t) + \alpha \left[ r^t + \gamma \max_{a' \in A} Q(s^{t+1}, a') - Q(s^t, a^t) \right]
    \end{align*}
    
\end{frame}

\begin{frame}{Q-Learning Pseudo Code}
    \centering
    \balg{Q-Learning}{ql}
        \State Initialize $Q(s, a) = 0$  for all $\st \in \St, \ac \in \Ac$
        \For{every episode}
            \For{$t = 0, 1, 2, \ldots$} 
                \State Observe current state $s^t$
                \State With probability $\epsilon$: choose random action $\ac^t \in \Ac$
                \State Otherwise: choose action $\ac^t \in \arg\max_{\ac} Q(\st^t, \ac)$
                \State Apply action $\ac^t$, observe reward $\rew^t$ and next state $\st^{t+1}$
                \State \Cboxed{red}{$Q(\st^t, \ac^t) \gets Q(\st^t, \ac^t) + \alpha \left[ \rew^t + \gamma \max_{\ac'} Q(\st^{t+1}, \ac') - Q(\st^t, \ac^t) \right]$}
            \EndFor
        \EndFor
    \ealg
\end{frame}


\begin{frame}{Evaluating RL Algorithms}

    \begin{columns}
    \begin{column}{0.5\textwidth}
        
        \begin{figure}
            \includegraphics[width=0.45\linewidth, height=0.45\textheight, keepaspectratio]{images/chapter_2/tabular_mdp_returns.pdf}
            \hspace{5pt} % Horizontal space between images
            % Row 1, Image 2
            \includegraphics[width=0.45\linewidth, height=0.45\textheight, keepaspectratio]{images/chapter_2/tabular_mdp_eplength.pdf}
        
            \vspace{5pt} % Vertical space between rows
        
            % Row 2, Image 3
            \includegraphics[width=0.45\linewidth, height=0.45\textheight, keepaspectratio]{images/chapter_2/tabular_mdp_ql_alphas_returns.pdf}
            \hspace{5pt} % Horizontal space between images
            % Row 2, Image 4
            \includegraphics[width=0.45\linewidth, height=0.45\textheight, keepaspectratio]{images/chapter_2/tabular_mdp_ql_eps_returns.pdf}
        \end{figure}
        \end{column}
        \begin{column}{0.5\textwidth}
            
            \textbf{Y-axis:}

            \blist
                \item {\small Average \textbf{discounted evaluation returns}. This shows us how our greedy policy would perform if we stopped learning at time step T.} 
                \item {\small In some cases, \textbf{undiscounted returns} are easier to interpret and may be used instead. }
            \elist
            
            \textbf{X-axis:}
            \blist 
                \item {\small Cumulative training steps across episodes.} 
                \item {\small Number of episodes can also be used. This might, however, distort the learning speed.} 
            \elist
        \end{column}
        
    \end{columns}

\end{frame}

% \begin{frame}{Evaluating RL Algorithms Continued}
% \small
%   \begin{columns}[T]
%     \begin{column}{0.6\textwidth}
%         \fat{Why Cumulative Time Steps and not n Episodes?}
%         \blist
%             \item Episodes can distort the speed at which an algorithm learns. 
%             \item Maybe, and the algorithm needs a few episodes to converge, but it takes a lot of time to explore in early episodes. 
%         \elist
        
%         \fat{Average Discounted Evaluation Returns.}
%         \blist
%             \item Returns are from a greedy policy with respect to learned action values after T time steps. 
%             \item We average to answer the following question: if we finish learning after T time steps and extract the greedy policy, what expected returns can we expect to achieve with this policy?
%         \elist
%     \end{column}

%     \begin{column}{0.4\textwidth}
%         \fat{What about Undiscounted Returns?}
%         \blist
%             \item We usually use discounted returns as this is the learning objective, but undiscounted rewards can sometimes be easier to interpret. 
%             % \item For example, suppose we want to learn an optimal policy for a video game in which the agent controls a spaceship and receives a +1 score (reward) for each destroyed enemy. If in an episode the agent destroys 10 enemies at various points in time, the undiscounted return will be 10 but the discounted return will be less than 10, making it more difficult to understand how many enemies the agent destroyed.
%         \elist
%     \end{column}

%   \end{columns}

% \end{frame}

\begin{frame}{Summary}

\fat{We covered:}
\blist
    \item Intro to multi-agent systems and MARL
    \item MDPs
    \item Value functions
    \item Dynamic programming
    \item Temporal-difference learning
\elist
\fat{Next we'll cover:}
\blist
    \item Games: models of multi-agent interaction
\elist

\end{frame}

\begin{frame}{Reading}

Richard Bellman: Dynamic Programming. Princeton University Press, 1957

Ronald Howard: Dynamic Programming and Markov Processes. John Wiley, 1960

Richard Sutton and Andrew Barto: Reinforcement learning: An introduction (2nd edition). MIT Press, 2018

\end{frame}
\end{document}
